{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e49a44b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4e808d62",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['nanot5-small-malaysian-cased/checkpoint-1300000',\n",
       " 'nanot5-small-malaysian-cased/checkpoint-1310000',\n",
       " 'nanot5-small-malaysian-cased/checkpoint-1320000']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "checkpoints = sorted(glob('nanot5-small-malaysian-cased/checkpoint-*'))\n",
    "checkpoints"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "89e8dad0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
      "  warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, T5ForConditionalGeneration\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoints[-1])\n",
    "model = T5ForConditionalGeneration.from_pretrained(checkpoints[-1])\n",
    "model.config.eos_token_id = tokenizer.eos_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6165adee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 0)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.config.decoder_start_token_id, model.config.pad_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6e45f7d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "s = \"\"\"\n",
    "Mohammad Najib bin Tun Haji Abdul Razak (Jawi: ‏محمد نجيب بن عبد الرزاق‎‎; lahir 23 Julai 1953) merupakan bekas Perdana Menteri Malaysia ke-6 dan bekas Menteri Besar Pahang ke-11. Beliau kini sedang menjalani hukuman penjara selama 12 tahun di Penjara Kajang atas sabitan kes SRC international.\n",
    "\"\"\".strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a2f0ad95",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/transformers/generation/utils.py:1421: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[' <triplet> Najib bin Tun Haji Abdul Razak <subj> 23 Julai 1953 <obj> tarikh lahir']\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode(f'teks ke grafik pengetahuan: {s}{tokenizer.eos_token}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "print(tokenizer.batch_decode(outputs, skip_special_tokens = True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cb109c00",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [3232, 82, 2], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer('helo' + tokenizer.eos_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b885022a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['SPECIAL_TOKENS_ATTRIBUTES',\n",
       " '__annotations__',\n",
       " '__call__',\n",
       " '__class__',\n",
       " '__delattr__',\n",
       " '__dict__',\n",
       " '__dir__',\n",
       " '__doc__',\n",
       " '__eq__',\n",
       " '__format__',\n",
       " '__ge__',\n",
       " '__getattribute__',\n",
       " '__gt__',\n",
       " '__hash__',\n",
       " '__init__',\n",
       " '__init_subclass__',\n",
       " '__le__',\n",
       " '__len__',\n",
       " '__lt__',\n",
       " '__module__',\n",
       " '__ne__',\n",
       " '__new__',\n",
       " '__reduce__',\n",
       " '__reduce_ex__',\n",
       " '__repr__',\n",
       " '__setattr__',\n",
       " '__sizeof__',\n",
       " '__str__',\n",
       " '__subclasshook__',\n",
       " '__weakref__',\n",
       " '_add_tokens',\n",
       " '_additional_special_tokens',\n",
       " '_auto_class',\n",
       " '_batch_encode_plus',\n",
       " '_bos_token',\n",
       " '_call_one',\n",
       " '_cls_token',\n",
       " '_compile_jinja_template',\n",
       " '_convert_encoding',\n",
       " '_convert_id_to_token',\n",
       " '_convert_token_to_id_with_added_voc',\n",
       " '_create_repo',\n",
       " '_decode',\n",
       " '_decode_use_source_tokenizer',\n",
       " '_encode_plus',\n",
       " '_eos_token',\n",
       " '_eventual_warn_about_too_long_sequence',\n",
       " '_eventually_correct_t5_max_length',\n",
       " '_from_pretrained',\n",
       " '_get_files_timestamps',\n",
       " '_get_padding_truncation_strategies',\n",
       " '_in_target_context_manager',\n",
       " '_mask_token',\n",
       " '_pad',\n",
       " '_pad_token',\n",
       " '_pad_token_type_id',\n",
       " '_processor_class',\n",
       " '_save_pretrained',\n",
       " '_sep_token',\n",
       " '_set_processor_class',\n",
       " '_switch_to_input_mode',\n",
       " '_switch_to_target_mode',\n",
       " '_tokenizer',\n",
       " '_unk_token',\n",
       " '_upload_modified_files',\n",
       " 'add_special_tokens',\n",
       " 'add_tokens',\n",
       " 'added_tokens_decoder',\n",
       " 'added_tokens_encoder',\n",
       " 'additional_special_tokens',\n",
       " 'additional_special_tokens_ids',\n",
       " 'all_special_ids',\n",
       " 'all_special_tokens',\n",
       " 'all_special_tokens_extended',\n",
       " 'apply_chat_template',\n",
       " 'as_target_tokenizer',\n",
       " 'backend_tokenizer',\n",
       " 'batch_decode',\n",
       " 'batch_encode_plus',\n",
       " 'bos_token',\n",
       " 'bos_token_id',\n",
       " 'build_inputs_with_special_tokens',\n",
       " 'can_save_slow_tokenizer',\n",
       " 'chat_template',\n",
       " 'clean_up_tokenization',\n",
       " 'clean_up_tokenization_spaces',\n",
       " 'cls_token',\n",
       " 'cls_token_id',\n",
       " 'convert_added_tokens',\n",
       " 'convert_ids_to_tokens',\n",
       " 'convert_tokens_to_ids',\n",
       " 'convert_tokens_to_string',\n",
       " 'create_token_type_ids_from_sequences',\n",
       " 'decode',\n",
       " 'decoder',\n",
       " 'default_chat_template',\n",
       " 'deprecation_warnings',\n",
       " 'encode',\n",
       " 'encode_plus',\n",
       " 'eos_token',\n",
       " 'eos_token_id',\n",
       " 'from_pretrained',\n",
       " 'get_added_vocab',\n",
       " 'get_special_tokens_mask',\n",
       " 'get_vocab',\n",
       " 'init_inputs',\n",
       " 'init_kwargs',\n",
       " 'is_fast',\n",
       " 'mask_token',\n",
       " 'mask_token_id',\n",
       " 'max_len_sentences_pair',\n",
       " 'max_len_single_sentence',\n",
       " 'max_model_input_sizes',\n",
       " 'model_input_names',\n",
       " 'model_max_length',\n",
       " 'name_or_path',\n",
       " 'num_special_tokens_to_add',\n",
       " 'pad',\n",
       " 'pad_token',\n",
       " 'pad_token_id',\n",
       " 'pad_token_type_id',\n",
       " 'padding_side',\n",
       " 'prepare_for_model',\n",
       " 'prepare_seq2seq_batch',\n",
       " 'pretrained_init_configuration',\n",
       " 'pretrained_vocab_files_map',\n",
       " 'push_to_hub',\n",
       " 'register_for_auto_class',\n",
       " 'sanitize_special_tokens',\n",
       " 'save_pretrained',\n",
       " 'save_vocabulary',\n",
       " 'sep_token',\n",
       " 'sep_token_id',\n",
       " 'set_truncation_and_padding',\n",
       " 'slow_tokenizer_class',\n",
       " 'special_tokens_map',\n",
       " 'special_tokens_map_extended',\n",
       " 'split_special_tokens',\n",
       " 'tokenize',\n",
       " 'train_new_from_iterator',\n",
       " 'truncate_sequences',\n",
       " 'truncation_side',\n",
       " 'unk_token',\n",
       " 'unk_token_id',\n",
       " 'verbose',\n",
       " 'vocab',\n",
       " 'vocab_files_names',\n",
       " 'vocab_size']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dir(tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ed0f2884",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<pad><s></s><unk>!'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([0,1,2,3,4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cc8de5b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
